This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# Load the crash data. `stringsAsFactors = TRUE` is stated explicitly because
# the releveling and level-collapsing steps further down operate on factor
# columns, and read.csv() no longer creates factors by default since R 4.0.0.
dat <- read.csv("2019-06-13-exam-pa-data-file.csv", stringsAsFactors = TRUE)
# Per-column overview: quantiles for numeric columns, counts for factor levels.
summary(dat)
##   Crash_Score          year          Month        Time_of_Day   
##  Min.   : 0.010   Min.   :2014   Min.   : 1.00   Min.   :1.000  
##  1st Qu.: 3.540   1st Qu.:2015   1st Qu.: 3.00   1st Qu.:3.000  
##  Median : 5.660   Median :2016   Median : 7.00   Median :4.000  
##  Mean   : 6.567   Mean   :2016   Mean   : 6.56   Mean   :4.034  
##  3rd Qu.: 8.600   3rd Qu.:2017   3rd Qu.:10.00   3rd Qu.:5.000  
##  Max.   :53.070   Max.   :2019   Max.   :12.00   Max.   :6.000  
##                                                                 
##         Rd_Feature            Rd_Character        Rd_Class    
##  DRIVEWAY    : 2373   CURVE-GRADE   :  643   OTHER    : 9960  
##  INTERSECTION: 6702   CURVE-LEVEL   :  725   STATE HWY:10603  
##  NONE        :13025   CURVE-OTHER   :  239   US HWY   : 2574  
##  OTHER       :  259   OTHER         :   13                    
##  RAMP        :  778   STRAIGHT-GRADE: 2622                    
##                       STRAIGHT-LEVEL:18215                    
##                       STRAIGHT-OTHER:  680                    
##                    Rd_Configuration            Rd_Surface   
##  ONE-WAY                   : 1496   COARSE ASPHALT  : 1997  
##  TWO-WAY-NO-MEDIAN         :12076   CONCRETE        :  692  
##  TWO-WAY-PROTECTED-MEDIAN  : 2627   GROOVED CONCRETE:  371  
##  TWO-WAY-UNPROTECTED-MEDIAN: 6882   OTHER           :   70  
##  UNKNOWN                   :   56   SMOOTH ASPHALT  :20007  
##                                                             
##                                                             
##         Rd_Conditions            Light         Weather     
##  DRY           :19262   DARK-LIT    : 3219   CLEAR :17393  
##  ICE-SNOW-SLUSH:  322   DARK-NOT-LIT:  708   CLOUDY: 3234  
##  OTHER         :  134   DAWN        :  140   OTHER :   85  
##  WET           : 3419   DAYLIGHT    :18262   RAIN  : 2230  
##                         DUSK        :  602   SNOW  :  195  
##                         OTHER       :  206                 
##                                                            
##   Traffic_Control  Work_Area  
##  NONE     :14028   NO :22823  
##  OTHER    :  228   YES:  314  
##  SIGNAL   : 6352              
##  STOP-SIGN: 2269              
##  YIELD    :  260              
##                               
## 
# Make the most frequent level of each categorical predictor (columns 5-14)
# the reference (first) level of its factor.
vars <- colnames(dat)[5:14]
for (i in vars) {
  # relevel() only accepts factors, so coerce first (a no-op for factors).
  column <- as.factor(dat[[i]])
  # which.max() returns the first maximum, so ties go to the earliest level.
  # Local names deliberately avoid masking base::table and base::max.
  level.name <- names(which.max(table(column)))
  dat[[i]] <- relevel(column, ref = level.name)
}
summary(dat)
##   Crash_Score          year          Month        Time_of_Day   
##  Min.   : 0.010   Min.   :2014   Min.   : 1.00   Min.   :1.000  
##  1st Qu.: 3.540   1st Qu.:2015   1st Qu.: 3.00   1st Qu.:3.000  
##  Median : 5.660   Median :2016   Median : 7.00   Median :4.000  
##  Mean   : 6.567   Mean   :2016   Mean   : 6.56   Mean   :4.034  
##  3rd Qu.: 8.600   3rd Qu.:2017   3rd Qu.:10.00   3rd Qu.:5.000  
##  Max.   :53.070   Max.   :2019   Max.   :12.00   Max.   :6.000  
##                                                                 
##         Rd_Feature            Rd_Character        Rd_Class    
##  NONE        :13025   STRAIGHT-LEVEL:18215   STATE HWY:10603  
##  DRIVEWAY    : 2373   CURVE-GRADE   :  643   OTHER    : 9960  
##  INTERSECTION: 6702   CURVE-LEVEL   :  725   US HWY   : 2574  
##  OTHER       :  259   CURVE-OTHER   :  239                    
##  RAMP        :  778   OTHER         :   13                    
##                       STRAIGHT-GRADE: 2622                    
##                       STRAIGHT-OTHER:  680                    
##                    Rd_Configuration            Rd_Surface   
##  TWO-WAY-NO-MEDIAN         :12076   SMOOTH ASPHALT  :20007  
##  ONE-WAY                   : 1496   COARSE ASPHALT  : 1997  
##  TWO-WAY-PROTECTED-MEDIAN  : 2627   CONCRETE        :  692  
##  TWO-WAY-UNPROTECTED-MEDIAN: 6882   GROOVED CONCRETE:  371  
##  UNKNOWN                   :   56   OTHER           :   70  
##                                                             
##                                                             
##         Rd_Conditions            Light         Weather     
##  DRY           :19262   DAYLIGHT    :18262   CLEAR :17393  
##  ICE-SNOW-SLUSH:  322   DARK-LIT    : 3219   CLOUDY: 3234  
##  OTHER         :  134   DARK-NOT-LIT:  708   OTHER :   85  
##  WET           : 3419   DAWN        :  140   RAIN  : 2230  
##                         DUSK        :  602   SNOW  :  195  
##                         OTHER       :  206                 
##                                                            
##   Traffic_Control  Work_Area  
##  NONE     :14028   NO :22823  
##  OTHER    :  228   YES:  314  
##  SIGNAL   : 6352              
##  STOP-SIGN: 2269              
##  YIELD    :  260              
##                               
## 

Task 1

library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
# Histogram of the target variable, using geom_histogram()'s default binning.
p <- ggplot(data = dat, mapping = aes(x = Crash_Score)) +
  geom_histogram()
print(p)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plot of Crash_Score against every other column, one plot per column.
vars <- colnames(dat)[colnames(dat) != "Crash_Score"]
for (i in vars) {
  # `.data[[i]]` resolves the column inside the data frame given to ggplot(),
  # instead of reaching out to the global `dat` from within aes().
  # as.factor() makes the numeric columns discrete so each value gets a box.
  box_plot <- ggplot(dat, aes(x = as.factor(.data[[i]]), y = Crash_Score)) +
    geom_boxplot() +
    labs(x = i)
  print(box_plot)
}

library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# For every predictor, print the mean, median and row count of Crash_Score
# within each of its values.
for (i in vars) {
  print(i)
  # `!!as.name(i)` turns the column name held in `i` into a grouping column,
  # replacing the deprecated group_by_(). The dplyr:: prefixes make sure the
  # dplyr verbs are used even though plyr exports a summarise() as well.
  x <- dat %>%
    dplyr::group_by(!!as.name(i)) %>%
    dplyr::summarise(
      mean = mean(Crash_Score),
      median = median(Crash_Score),
      n = n()
    )

  print(x)
}
## [1] "year"
## Warning: group_by_() is deprecated. 
## Please use group_by() instead
## 
## The 'programming' vignette or the tidyeval book can help you
## to program with group_by() : https://tidyeval.tidyverse.org
## This warning is displayed once per session.
## # A tibble: 6 x 4
##    year  mean median     n
##   <int> <dbl>  <dbl> <int>
## 1  2014  6.62   5.77  3928
## 2  2015  6.53   5.58  4402
## 3  2016  6.62   5.68  4651
## 4  2017  6.58   5.66  4558
## 5  2018  6.52   5.63  4833
## 6  2019  6.42   5.6    765
## [1] "Month"
## # A tibble: 12 x 4
##    Month  mean median     n
##    <int> <dbl>  <dbl> <int>
##  1     1  6.56   5.76  2052
##  2     2  6.67   5.66  1947
##  3     3  6.54   5.67  1887
##  4     4  6.61   5.68  1705
##  5     5  6.52   5.71  2013
##  6     6  6.52   5.53  1864
##  7     7  6.51   5.65  1671
##  8     8  6.53   5.7   1926
##  9     9  6.54   5.65  1843
## 10    10  6.65   5.64  2186
## 11    11  6.48   5.57  1972
## 12    12  6.66   5.65  2071
## [1] "Time_of_Day"
## # A tibble: 6 x 4
##   Time_of_Day  mean median     n
##         <int> <dbl>  <dbl> <int>
## 1           1  5.87   4.86   808
## 2           2  6.38   5.44  1627
## 3           3  6.63   5.7   4827
## 4           4  6.55   5.71  6939
## 5           5  6.74   5.79  6579
## 6           6  6.38   5.51  2357
## [1] "Rd_Feature"
## # A tibble: 5 x 4
##   Rd_Feature    mean median     n
##   <fct>        <dbl>  <dbl> <int>
## 1 NONE          6.38   5.45 13025
## 2 DRIVEWAY      6.23   5.52  2373
## 3 INTERSECTION  7.08   6.2   6702
## 4 OTHER         6.35   5.25   259
## 5 RAMP          6.29   5.50   778
## [1] "Rd_Character"
## # A tibble: 7 x 4
##   Rd_Character    mean median     n
##   <fct>          <dbl>  <dbl> <int>
## 1 STRAIGHT-LEVEL  6.60   5.67 18215
## 2 CURVE-GRADE     6.09   5.06   643
## 3 CURVE-LEVEL     6.15   5.06   725
## 4 CURVE-OTHER     6.78   5.44   239
## 5 OTHER           5.66   4.16    13
## 6 STRAIGHT-GRADE  6.61   5.86  2622
## 7 STRAIGHT-OTHER  6.41   5.63   680
## [1] "Rd_Class"
## # A tibble: 3 x 4
##   Rd_Class   mean median     n
##   <fct>     <dbl>  <dbl> <int>
## 1 STATE HWY  6.90   5.98 10603
## 2 OTHER      6.15   5.35  9960
## 3 US HWY     6.79   5.65  2574
## [1] "Rd_Configuration"
## # A tibble: 5 x 4
##   Rd_Configuration            mean median     n
##   <fct>                      <dbl>  <dbl> <int>
## 1 TWO-WAY-NO-MEDIAN           6.40   5.53 12076
## 2 ONE-WAY                     6.26   5.38  1496
## 3 TWO-WAY-PROTECTED-MEDIAN    6.82   5.76  2627
## 4 TWO-WAY-UNPROTECTED-MEDIAN  6.84   5.96  6882
## 5 UNKNOWN                     6.30   5.46    56
## [1] "Rd_Surface"
## # A tibble: 5 x 4
##   Rd_Surface        mean median     n
##   <fct>            <dbl>  <dbl> <int>
## 1 SMOOTH ASPHALT    6.57   5.67 20007
## 2 COARSE ASPHALT    6.64   5.75  1997
## 3 CONCRETE          6.14   5.11   692
## 4 GROOVED CONCRETE  6.73   5.8    371
## 5 OTHER             5.99   4.97    70
## [1] "Rd_Conditions"
## # A tibble: 4 x 4
##   Rd_Conditions   mean median     n
##   <fct>          <dbl>  <dbl> <int>
## 1 DRY             6.57   5.66 19262
## 2 ICE-SNOW-SLUSH  6.59   5.87   322
## 3 OTHER           6.23   5.63   134
## 4 WET             6.54   5.62  3419
## [1] "Light"
## # A tibble: 6 x 4
##   Light         mean median     n
##   <fct>        <dbl>  <dbl> <int>
## 1 DAYLIGHT      6.66   5.75 18262
## 2 DARK-LIT      6.24   5.4   3219
## 3 DARK-NOT-LIT  6.00   4.76   708
## 4 DAWN          6.32   5.44   140
## 5 DUSK          6.70   5.84   602
## 6 OTHER         5.36   4.33   206
## [1] "Weather"
## # A tibble: 5 x 4
##   Weather  mean median     n
##   <fct>   <dbl>  <dbl> <int>
## 1 CLEAR    6.57   5.65 17393
## 2 CLOUDY   6.52   5.68  3234
## 3 OTHER    5.65   4.84    85
## 4 RAIN     6.65   5.68  2230
## 5 SNOW     6.71   5.98   195
## [1] "Traffic_Control"
## # A tibble: 5 x 4
##   Traffic_Control  mean median     n
##   <fct>           <dbl>  <dbl> <int>
## 1 NONE             6.29   5.4  14028
## 2 OTHER            6.65   5.61   228
## 3 SIGNAL           7.04   6.14  6352
## 4 STOP-SIGN        6.95   6.14  2269
## 5 YIELD            6.60   5.56   260
## [1] "Work_Area"
## # A tibble: 2 x 4
##   Work_Area  mean median     n
##   <fct>     <dbl>  <dbl> <int>
## 1 NO         6.56   5.65 22823
## 2 YES        7.19   5.96   314

Task 2

# Bar chart of row counts for every column except the target.
vars <- colnames(dat)[colnames(dat) != "Crash_Score"]
for (i in vars) {
  # `.data[[i]]` resolves the column inside the data frame given to ggplot(),
  # instead of reaching out to the global `dat` from within aes().
  bar_plot <- ggplot(dat, aes(x = .data[[i]])) +
    geom_bar() +
    labs(x = i) +
    # Rotate the tick labels so long level names stay readable.
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  print(bar_plot)
}

# Work on a copy so `dat` keeps its original factor levels.
dat2 <- dat

library(plyr)
# Collapse Traffic_Control to two levels. Each old level is paired with its
# new label explicitly, so the result does not depend on the order in which
# the factor levels happen to be stored.
var <- "Traffic_Control"
dat2[[var]] <- as.factor(dat2[[var]])
dat2[[var]] <- plyr::mapvalues(
  dat2[[var]],
  from = c("NONE", "OTHER", "SIGNAL", "STOP-SIGN", "YIELD"),
  to = c("OTHER", "OTHER", "SIGNAL-STOP", "SIGNAL-STOP", "OTHER")
)

# Make the most frequent collapsed level the reference level.
level.name <- names(which.max(table(dat2[[var]])))
dat2[[var]] <- relevel(dat2[[var]], ref = level.name)

table(dat2[[var]])
## 
##       OTHER SIGNAL-STOP 
##       14516        8621
# Collapse Rd_Character to STRAIGHT vs CURVE. Each old level is paired with
# its new label explicitly, so the result does not depend on level order.
# The small OTHER level is grouped with CURVE.
var <- "Rd_Character"
dat2[[var]] <- as.factor(dat2[[var]])
dat2[[var]] <- plyr::mapvalues(
  dat2[[var]],
  from = c(
    "STRAIGHT-LEVEL", "CURVE-GRADE", "CURVE-LEVEL", "CURVE-OTHER",
    "OTHER", "STRAIGHT-GRADE", "STRAIGHT-OTHER"
  ),
  to = c(
    "STRAIGHT", "CURVE", "CURVE", "CURVE",
    "CURVE", "STRAIGHT", "STRAIGHT"
  )
)
# Make the most frequent collapsed level the reference level.
level.name <- names(which.max(table(dat2[[var]])))
dat2[[var]] <- relevel(dat2[[var]], ref = level.name)
table(dat2[[var]])
## 
## STRAIGHT    CURVE 
##    21517     1620
# Collapse Rd_Feature to INTERSECTION vs everything else. Each old level is
# paired with its new label explicitly, so the result does not depend on
# level order.
var <- "Rd_Feature"
dat2[[var]] <- as.factor(dat2[[var]])
dat2[[var]] <- plyr::mapvalues(
  dat2[[var]],
  from = c("NONE", "DRIVEWAY", "INTERSECTION", "OTHER", "RAMP"),
  to = c("OTHER", "OTHER", "INTERSECTION", "OTHER", "OTHER")
)
# Make the most frequent collapsed level the reference level.
level.name <- names(which.max(table(dat2[[var]])))
dat2[[var]] <- relevel(dat2[[var]], ref = level.name)
table(dat2[[var]])
## 
##        OTHER INTERSECTION 
##        16435         6702
# Turn the numeric Time_of_Day codes 1-6 into a factor and group codes 3-5
# into a single DAYTIME level. Each code is paired with its label explicitly.
var <- "Time_of_Day"
dat2[[var]] <- as.factor(dat2[[var]])
dat2[[var]] <- plyr::mapvalues(
  dat2[[var]],
  from = c("1", "2", "3", "4", "5", "6"),
  to = c(
    "OVERNIGHT", "LATE-EARLY", "DAYTIME", "DAYTIME", "DAYTIME", "LATE-NIGHT"
  )
)
# Make the most frequent collapsed level the reference level.
level.name <- names(which.max(table(dat2[[var]])))
dat2[[var]] <- relevel(dat2[[var]], ref = level.name)
table(dat2[[var]])
## 
##    DAYTIME  OVERNIGHT LATE-EARLY LATE-NIGHT 
##      18345        808       1627       2357
# Collapse Rd_Surface to ASPHALT vs OTHER. Each old level is paired with its
# new label explicitly, so the result does not depend on level order.
var <- "Rd_Surface"
dat2[[var]] <- as.factor(dat2[[var]])
dat2[[var]] <- plyr::mapvalues(
  dat2[[var]],
  from = c(
    "SMOOTH ASPHALT", "COARSE ASPHALT", "CONCRETE", "GROOVED CONCRETE", "OTHER"
  ),
  to = c("ASPHALT", "ASPHALT", "OTHER", "OTHER", "OTHER")
)
# Make the most frequent collapsed level the reference level.
level.name <- names(which.max(table(dat2[[var]])))
dat2[[var]] <- relevel(dat2[[var]], ref = level.name)
table(dat2[[var]])
## 
## ASPHALT   OTHER 
##   22004    1133

Task 3

# Keep only the three columns that are dummy-coded and passed to prcomp().
datPCA <- dat[c("Rd_Conditions", "Light", "Weather")]

library(caret)
## Loading required package: lattice
varsPCA <- colnames(datPCA)
# Store every PCA input column as character before dummy coding.
datPCA[varsPCA] <- lapply(datPCA[varsPCA], as.character)

# One 0/1 indicator column per level; fullRank = FALSE keeps every level.
pca_formula <- paste("~", paste(varsPCA, collapse = "+"))
binarizer <- caret::dummyVars(pca_formula, data = datPCA, fullRank = FALSE)
datPCAbin <- data.frame(predict(binarizer, newdata = datPCA))
head(datPCAbin)
# Principal components of the centered, unit-variance indicator columns.
PCAweather <- prcomp(datPCAbin, scale. = TRUE, center = TRUE)
summary(PCAweather)
## Importance of components:
##                          PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.829 1.3740 1.2796 1.2379 1.14429 1.03216 1.01236
## Proportion of Variance 0.223 0.1259 0.1092 0.1022 0.08729 0.07102 0.06833
## Cumulative Proportion  0.223 0.3489 0.4580 0.5602 0.64748 0.71851 0.78683
##                           PC8    PC9    PC10    PC11    PC12      PC13
## Standard deviation     1.0033 0.9174 0.79731 0.64583 0.54470 5.436e-15
## Proportion of Variance 0.0671 0.0561 0.04238 0.02781 0.01978 0.000e+00
## Cumulative Proportion  0.8539 0.9100 0.95241 0.98022 1.00000 1.000e+00
##                           PC14      PC15
## Standard deviation     2.2e-15 1.729e-15
## Proportion of Variance 0.0e+00 0.000e+00
## Cumulative Proportion  1.0e+00 1.000e+00
# Loadings: one row per indicator column, one column per principal component.
PCAweather[["rotation"]]
##                                     PC1         PC2          PC3
## Rd_ConditionsDRY            -0.51165971  0.03279495 -0.074984796
## Rd_ConditionsICE.SNOW.SLUSH  0.09037524  0.08506534  0.662448145
## Rd_ConditionsOTHER           0.05610221  0.18320852  0.103092721
## Rd_ConditionsWET             0.49654749 -0.10176327 -0.161823749
## LightDARK.LIT                0.11584644  0.52794265 -0.134963861
## LightDARK.NOT.LIT            0.05371675  0.19840327 -0.012771256
## LightDAWN                    0.03037488  0.07312351  0.008834873
## LightDAYLIGHT               -0.14979749 -0.66027088  0.122825366
## LightDUSK                    0.04011811  0.17211754 -0.069299885
## LightOTHER                   0.03196240  0.20556965  0.097572239
## WeatherCLEAR                -0.45856690  0.18940018 -0.043504511
## WeatherCLOUDY                0.16796308 -0.22634633  0.028404961
## WeatherOTHER                 0.05593982  0.14313571  0.095611440
## WeatherRAIN                  0.43250589 -0.06252514 -0.190678603
## WeatherSNOW                  0.09667013  0.07063727  0.650123103
##                                     PC4         PC5          PC6
## Rd_ConditionsDRY             0.01484130 -0.18123640 -0.000304546
## Rd_ConditionsICE.SNOW.SLUSH  0.17309970  0.03263430 -0.003229391
## Rd_ConditionsOTHER          -0.59102357  0.08185876  0.013871952
## Rd_ConditionsWET             0.05361910  0.16242361 -0.001579706
## LightDARK.LIT                0.20553848 -0.15082249  0.524023896
## LightDARK.NOT.LIT            0.08325274 -0.02460026 -0.666073761
## LightDAWN                    0.04478967 -0.08625994 -0.148703516
## LightDAYLIGHT               -0.11844823  0.19122929  0.064770906
## LightDUSK                    0.03224585 -0.14753349 -0.501319635
## LightOTHER                  -0.48728603  0.09181326 -0.018210396
## WeatherCLEAR                 0.08594496  0.35125976 -0.020935930
## WeatherCLOUDY               -0.14707914 -0.76095953  0.031641146
## WeatherOTHER                -0.49208702  0.06877513  0.035509159
## WeatherRAIN                  0.09025035  0.35702863 -0.020451317
## WeatherSNOW                  0.18605948  0.02838991  0.021438779
##                                      PC7          PC8          PC9
## Rd_ConditionsDRY            -0.015248814 -0.007849658 -0.003938895
## Rd_ConditionsICE.SNOW.SLUSH  0.028817357 -0.020239216 -0.011409533
## Rd_ConditionsOTHER          -0.015819945  0.004441833  0.003261074
## Rd_ConditionsWET             0.009914945  0.013990907  0.007213656
## LightDARK.LIT               -0.050731848 -0.067682247 -0.062223343
## LightDARK.NOT.LIT           -0.624270732 -0.165555370 -0.071559483
## LightDAWN                    0.005078405  0.969593680 -0.066722229
## LightDAYLIGHT                0.007612700 -0.009937129 -0.049517135
## LightDUSK                    0.771261686 -0.147469720 -0.029538956
## LightOTHER                  -0.012772847  0.045476572  0.680552997
## WeatherCLEAR                 0.027257542  0.030792065  0.019213521
## WeatherCLOUDY               -0.080936296 -0.038591035  0.063032700
## WeatherOTHER                 0.018306649 -0.019886896 -0.716703836
## WeatherRAIN                  0.036838111  0.010225294  0.039016123
## WeatherSNOW                  0.047153728 -0.018980562  0.018478936
##                                    PC10         PC11         PC12
## Rd_ConditionsDRY            -0.10561246 -0.174404093  0.408737115
## Rd_ConditionsICE.SNOW.SLUSH -0.02298359  0.667192980  0.139630559
## Rd_ConditionsOTHER           0.75494351 -0.019528178  0.075105047
## Rd_ConditionsWET            -0.04271335 -0.032559601 -0.492233199
## LightDARK.LIT                0.04761367  0.005227977  0.014691286
## LightDARK.NOT.LIT            0.01708870 -0.022654370  0.002144386
## LightDAWN                    0.01903033 -0.004732147  0.014857170
## LightDAYLIGHT                0.04151241  0.006230309 -0.009865851
## LightDUSK                    0.03318236  0.002382120 -0.008017564
## LightOTHER                  -0.45890809 -0.004901907 -0.013905055
## WeatherCLEAR                 0.06431667  0.122166890 -0.374060795
## WeatherCLOUDY                0.01221875  0.071474866 -0.067919098
## WeatherOTHER                -0.44168092 -0.023803584 -0.006415880
## WeatherRAIN                 -0.02332117 -0.038841641  0.645813166
## WeatherSNOW                  0.01730821 -0.707287146 -0.055286182
##                                   PC13        PC14        PC15
## Rd_ConditionsDRY            0.39278871 -0.13988638  0.56181979
## Rd_ConditionsICE.SNOW.SLUSH 0.12322835 -0.04388611  0.17625793
## Rd_ConditionsOTHER          0.07982099 -0.02842716  0.11417082
## Rd_ConditionsWET            0.37329609 -0.13294435  0.53393879
## LightDARK.LIT               0.08964679 -0.53831132 -0.19670839
## LightDARK.NOT.LIT           0.04461418 -0.26789940 -0.09789514
## LightDAWN                   0.02008866 -0.12062846 -0.04407975
## LightDAYLIGHT               0.10563622 -0.63432472 -0.23179337
## LightDUSK                   0.04123618 -0.24761513 -0.09048291
## LightOTHER                  0.02433305 -0.14611516 -0.05339304
## WeatherCLEAR                0.55120660  0.21321542 -0.33228056
## WeatherCLOUDY               0.44243475  0.17114075 -0.26671028
## WeatherOTHER                0.07719403  0.02985987 -0.04653441
## WeatherRAIN                 0.37654617  0.14565401 -0.22699107
## WeatherSNOW                 0.11664144  0.04511875 -0.07031426
# Standardize the indicator columns (center and scale each column).
datPCAbin.std <- as.data.frame(scale(datPCAbin))
# The new feature is added to dat2 (not to a fresh copy of dat) so that the
# collapsed factor levels created for dat2 are kept alongside it.
#
# WETorDRY approximates PC1 from its four largest loadings, rounded from the
# rotation matrix: DRY -0.51, WET +0.50, CLEAR -0.46, RAIN +0.43.
# The DRY term used to be added with a positive sign, contradicting its
# negative PC1 loading. Any printed results produced before this correction
# (WETorDRY summaries, model coefficients) must be regenerated by re-running
# the notebook.
dat2$WETorDRY <- -0.51 * datPCAbin.std$Rd_ConditionsDRY +
  0.50 * datPCAbin.std$Rd_ConditionsWET -
  0.46 * datPCAbin.std$WeatherCLEAR +
  0.43 * datPCAbin.std$WeatherRAIN
summary(dat2$WETorDRY)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.7500 -0.3842 -0.3842  0.0000 -0.3842  2.1807
# Promote the working copy to `dat`, then remove the two raw columns that the
# WETorDRY feature was built from.
dat <- dat2
dat[c("Rd_Conditions", "Weather")] <- NULL
summary(dat)
##   Crash_Score          year          Month           Time_of_Day   
##  Min.   : 0.010   Min.   :2014   Min.   : 1.00   DAYTIME   :18345  
##  1st Qu.: 3.540   1st Qu.:2015   1st Qu.: 3.00   OVERNIGHT :  808  
##  Median : 5.660   Median :2016   Median : 7.00   LATE-EARLY: 1627  
##  Mean   : 6.567   Mean   :2016   Mean   : 6.56   LATE-NIGHT: 2357  
##  3rd Qu.: 8.600   3rd Qu.:2017   3rd Qu.:10.00                     
##  Max.   :53.070   Max.   :2019   Max.   :12.00                     
##         Rd_Feature      Rd_Character        Rd_Class    
##  OTHER       :16435   STRAIGHT:21517   STATE HWY:10603  
##  INTERSECTION: 6702   CURVE   : 1620   OTHER    : 9960  
##                                        US HWY   : 2574  
##                                                         
##                                                         
##                                                         
##                    Rd_Configuration   Rd_Surface             Light      
##  TWO-WAY-NO-MEDIAN         :12076   ASPHALT:22004   DAYLIGHT    :18262  
##  ONE-WAY                   : 1496   OTHER  : 1133   DARK-LIT    : 3219  
##  TWO-WAY-PROTECTED-MEDIAN  : 2627                   DARK-NOT-LIT:  708  
##  TWO-WAY-UNPROTECTED-MEDIAN: 6882                   DAWN        :  140  
##  UNKNOWN                   :   56                   DUSK        :  602  
##                                                     OTHER       :  206  
##     Traffic_Control  Work_Area      WETorDRY      
##  OTHER      :14516   NO :22823   Min.   :-1.7500  
##  SIGNAL-STOP: 8621   YES:  314   1st Qu.:-0.3842  
##                                  Median :-0.3842  
##                                  Mean   : 0.0000  
##                                  3rd Qu.:-0.3842  
##                                  Max.   : 2.1807

Task 4

# log(Crash_Score) by Rd_Character, split by Rd_Class, one panel per
# Rd_Character level. `scales` is spelled out in full rather than relying on
# partial argument matching of `scale`.
ggplot(dat, aes(x = Rd_Character, y = log(Crash_Score), fill = Rd_Class)) +
  geom_boxplot() +
  facet_wrap(~Rd_Character, scales = "free")

# log(Crash_Score) by Traffic_Control, split by Rd_Feature, one panel per
# Traffic_Control level. `scales` is spelled out in full rather than relying
# on partial argument matching of `scale`.
ggplot(dat, aes(x = Traffic_Control, y = log(Crash_Score), fill = Rd_Feature)) +
  geom_boxplot() +
  facet_wrap(~Traffic_Control, scales = "free")

Task 5

# Treat Month as categorical so each month gets its own model coefficient.
dat[["Month"]] <- as.factor(dat[["Month"]])
levels(dat[["Month"]])
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
library(caret)
# Fix the RNG seed so the train/test split is reproducible.
set.seed(1234)

# Row indices for the training set: 75% of the rows, drawn on Crash_Score.
partition <- createDataPartition(y = dat$Crash_Score, p = 0.75, list = FALSE)
train <- dat[partition, ]
test <- dat[-partition, ]

# Compare the mean target value in the two partitions.
print("TRAIN")
## [1] "TRAIN"
mean(train[["Crash_Score"]])
## [1] 6.561156
print("TEST")
## [1] "TEST"
mean(test[["Crash_Score"]])
## [1] 6.58402
# Gaussian GLM of Crash_Score on every other column of the training set.
GLMols <- glm(
  formula = Crash_Score ~ .,
  family = gaussian(),
  data = train
)
summary(GLMols)
## 
## Call:
## glm(formula = Crash_Score ~ ., family = gaussian(), data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -7.119  -2.970  -0.876   2.011  47.205  
## 
## Coefficients:
##                                              Estimate Std. Error t value
## (Intercept)                                103.378410  44.597458   2.318
## year                                        -0.047954   0.022115  -2.168
## Month2                                       0.228312   0.154849   1.474
## Month3                                       0.110909   0.156551   0.708
## Month4                                       0.003026   0.161885   0.019
## Month5                                       0.002956   0.153889   0.019
## Month6                                      -0.127018   0.158568  -0.801
## Month7                                      -0.100649   0.161562  -0.623
## Month8                                      -0.058145   0.155699  -0.373
## Month9                                       0.004877   0.158549   0.031
## Month10                                      0.167344   0.151545   1.104
## Month11                                     -0.023279   0.155254  -0.150
## Month12                                      0.331391   0.153274   2.162
## Time_of_DayOVERNIGHT                        -0.535530   0.190537  -2.811
## Time_of_DayLATE-EARLY                       -0.160209   0.130249  -1.230
## Time_of_DayLATE-NIGHT                       -0.096836   0.125186  -0.774
## Rd_FeatureINTERSECTION                       0.335777   0.093745   3.582
## Rd_CharacterCURVE                           -0.210142   0.128721  -1.633
## Rd_ClassOTHER                               -0.543359   0.080441  -6.755
## Rd_ClassUS HWY                               0.110509   0.131862   0.838
## Rd_ConfigurationONE-WAY                     -0.134412   0.141906  -0.947
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN     0.089917   0.129186   0.696
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN  -0.025951   0.084510  -0.307
## Rd_ConfigurationUNKNOWN                      0.395609   0.635595   0.622
## Rd_SurfaceOTHER                             -0.264032   0.161973  -1.630
## LightDARK-LIT                               -0.317882   0.113729  -2.795
## LightDARK-NOT-LIT                           -0.666567   0.200865  -3.318
## LightDAWN                                   -0.175518   0.435735  -0.403
## LightDUSK                                   -0.183188   0.208614  -0.878
## LightOTHER                                  -0.734078   0.348821  -2.104
## Traffic_ControlSIGNAL-STOP                   0.235240   0.088518   2.658
## Work_AreaYES                                 0.317833   0.278329   1.142
## WETorDRY                                    -0.005966   0.040202  -0.148
##                                            Pr(>|t|)    
## (Intercept)                                0.020459 *  
## year                                       0.030144 *  
## Month2                                     0.140387    
## Month3                                     0.478672    
## Month4                                     0.985085    
## Month5                                     0.984675    
## Month6                                     0.423124    
## Month7                                     0.533308    
## Month8                                     0.708821    
## Month9                                     0.975462    
## Month10                                    0.269498    
## Month11                                    0.880814    
## Month12                                    0.030625 *  
## Time_of_DayOVERNIGHT                       0.004950 ** 
## Time_of_DayLATE-EARLY                      0.218705    
## Time_of_DayLATE-NIGHT                      0.439218    
## Rd_FeatureINTERSECTION                     0.000342 ***
## Rd_CharacterCURVE                          0.102585    
## Rd_ClassOTHER                              1.48e-11 ***
## Rd_ClassUS HWY                             0.402009    
## Rd_ConfigurationONE-WAY                    0.343556    
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN   0.486420    
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN 0.758790    
## Rd_ConfigurationUNKNOWN                    0.533672    
## Rd_SurfaceOTHER                            0.103099    
## LightDARK-LIT                              0.005194 ** 
## LightDARK-NOT-LIT                          0.000907 ***
## LightDAWN                                  0.687094    
## LightDUSK                                  0.379890    
## LightOTHER                                 0.035353 *  
## Traffic_ControlSIGNAL-STOP                 0.007879 ** 
## Work_AreaYES                               0.253498    
## WETorDRY                                   0.882018    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 17.97454)
## 
##     Null deviance: 315875  on 17353  degrees of freedom
## Residual deviance: 311337  on 17321  degrees of freedom
## AIC: 99418
## 
## Number of Fisher Scoring iterations: 2
print("AIC")
## [1] "AIC"
stats::AIC(GLMols)
## [1] 99418.44
# Predictions for the hold-out rows. The result is stored under the name
# `predict`; calls to the predict() function keep working because R skips
# non-function objects when resolving a function call.
predict <- stats::predict(GLMols, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
# Root mean squared error on the test set.
squared_errors <- (test$Crash_Score - predict)^2
sqrt(sum(squared_errors) / nrow(test))
## [1] 4.285195
# Gamma GLM with log link on every column plus the
# Traffic_Control x Rd_Feature interaction.
GLMgamma <- glm(
  formula = Crash_Score ~ . + Traffic_Control:Rd_Feature,
  family = Gamma(link = "log"),
  data = train
)
summary(GLMgamma)
## 
## Call:
## glm(formula = Crash_Score ~ . + Traffic_Control:Rd_Feature, family = Gamma(link = "log"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3179  -0.5544  -0.1435   0.2800   3.4186  
## 
## Coefficients:
##                                                     Estimate Std. Error
## (Intercept)                                       16.4006471  6.8032872
## year                                              -0.0071933  0.0033737
## Month2                                             0.0347164  0.0236225
## Month3                                             0.0159515  0.0238826
## Month4                                             0.0004506  0.0246954
## Month5                                             0.0004134  0.0234759
## Month6                                            -0.0194821  0.0241895
## Month7                                            -0.0162322  0.0246461
## Month8                                            -0.0089946  0.0237518
## Month9                                            -0.0008317  0.0241880
## Month10                                            0.0248085  0.0231181
## Month11                                           -0.0056385  0.0236839
## Month12                                            0.0510152  0.0233827
## Time_of_DayOVERNIGHT                              -0.0900418  0.0290688
## Time_of_DayLATE-EARLY                             -0.0244010  0.0198718
## Time_of_DayLATE-NIGHT                             -0.0141743  0.0190982
## Rd_FeatureINTERSECTION                             0.0679312  0.0246778
## Rd_CharacterCURVE                                 -0.0338970  0.0196827
## Rd_ClassOTHER                                     -0.0819679  0.0123015
## Rd_ClassUS HWY                                     0.0180421  0.0201241
## Rd_ConfigurationONE-WAY                           -0.0215608  0.0216656
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN           0.0140325  0.0197109
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN        -0.0047520  0.0128943
## Rd_ConfigurationUNKNOWN                            0.0740870  0.0969596
## Rd_SurfaceOTHER                                   -0.0423269  0.0247223
## LightDARK-LIT                                     -0.0505747  0.0173505
## LightDARK-NOT-LIT                                 -0.1049072  0.0306462
## LightDAWN                                         -0.0263254  0.0664712
## LightDUSK                                         -0.0266138  0.0318238
## LightOTHER                                        -0.1190162  0.0532127
## Traffic_ControlSIGNAL-STOP                         0.0460243  0.0157969
## Work_AreaYES                                       0.0499734  0.0424600
## WETorDRY                                          -0.0017194  0.0061328
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP -0.0278197  0.0298016
##                                                   t value Pr(>|t|)    
## (Intercept)                                         2.411  0.01593 *  
## year                                               -2.132  0.03300 *  
## Month2                                              1.470  0.14168    
## Month3                                              0.668  0.50420    
## Month4                                              0.018  0.98544    
## Month5                                              0.018  0.98595    
## Month6                                             -0.805  0.42060    
## Month7                                             -0.659  0.51015    
## Month8                                             -0.379  0.70492    
## Month9                                             -0.034  0.97257    
## Month10                                             1.073  0.28323    
## Month11                                            -0.238  0.81183    
## Month12                                             2.182  0.02914 *  
## Time_of_DayOVERNIGHT                               -3.098  0.00195 ** 
## Time_of_DayLATE-EARLY                              -1.228  0.21949    
## Time_of_DayLATE-NIGHT                              -0.742  0.45799    
## Rd_FeatureINTERSECTION                              2.753  0.00592 ** 
## Rd_CharacterCURVE                                  -1.722  0.08506 .  
## Rd_ClassOTHER                                      -6.663 2.76e-11 ***
## Rd_ClassUS HWY                                      0.897  0.36998    
## Rd_ConfigurationONE-WAY                            -0.995  0.31967    
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN            0.712  0.47653    
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN         -0.369  0.71248    
## Rd_ConfigurationUNKNOWN                             0.764  0.44482    
## Rd_SurfaceOTHER                                    -1.712  0.08690 .  
## LightDARK-LIT                                      -2.915  0.00356 ** 
## LightDARK-NOT-LIT                                  -3.423  0.00062 ***
## LightDAWN                                          -0.396  0.69208    
## LightDUSK                                          -0.836  0.40301    
## LightOTHER                                         -2.237  0.02532 *  
## Traffic_ControlSIGNAL-STOP                          2.913  0.00358 ** 
## Work_AreaYES                                        1.177  0.23923    
## WETorDRY                                           -0.280  0.77920    
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP  -0.933  0.35058    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.4182877)
## 
##     Null deviance: 7311.8  on 17353  degrees of freedom
## Residual deviance: 7204.0  on 17320  degrees of freedom
## AIC: 93235
## 
## Number of Fisher Scoring iterations: 5
# Report the in-sample AIC and the RMSE on `test` for the Gamma GLM.
# The predictions get their own name so that a numeric vector does not mask
# the `predict()` generic in the global environment.
print("AIC")
## [1] "AIC"
AIC(GLMgamma)
## [1] 93235.01
# type = "response" returns predictions on the scale of Crash_Score rather
# than on the scale of the linear predictor.
gamma_pred <- predict(GLMgamma, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
sqrt(mean((test$Crash_Score - gamma_pred)^2))
## [1] 4.285252
# Fit a GLM of Crash_Score on every other column of `train` plus the
# Traffic_Control x Rd_Feature interaction, using a gaussian error
# distribution with a log link, then print the coefficient table.
GLMnorm <- glm(
  Crash_Score ~ . + Traffic_Control:Rd_Feature,
  family = gaussian(link = "log"),
  data = train
)
summary(GLMnorm)
## 
## Call:
## glm(formula = Crash_Score ~ . + Traffic_Control:Rd_Feature, family = gaussian(link = "log"), 
##     data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -7.110  -2.966  -0.881   2.013  47.225  
## 
## Coefficients:
##                                                     Estimate Std. Error
## (Intercept)                                       16.9230377  6.7748674
## year                                              -0.0074520  0.0033596
## Month2                                             0.0346130  0.0234388
## Month3                                             0.0177666  0.0238523
## Month4                                             0.0004638  0.0248062
## Month5                                             0.0003880  0.0236406
## Month6                                            -0.0198460  0.0245978
## Month7                                            -0.0146273  0.0249741
## Month8                                            -0.0088130  0.0240056
## Month9                                             0.0021391  0.0242806
## Month10                                            0.0256225  0.0229781
## Month11                                           -0.0009390  0.0239097
## Month12                                            0.0488685  0.0230333
## Time_of_DayOVERNIGHT                              -0.0867608  0.0321548
## Time_of_DayLATE-EARLY                             -0.0239232  0.0199603
## Time_of_DayLATE-NIGHT                             -0.0154884  0.0195101
## Rd_FeatureINTERSECTION                             0.0704499  0.0236039
## Rd_CharacterCURVE                                 -0.0338879  0.0203990
## Rd_ClassOTHER                                     -0.0832792  0.0123231
## Rd_ClassUS HWY                                     0.0190764  0.0194776
## Rd_ConfigurationONE-WAY                           -0.0244057  0.0227316
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN           0.0131801  0.0190852
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN        -0.0032949  0.0125138
## Rd_ConfigurationUNKNOWN                            0.0515541  0.0993109
## Rd_SurfaceOTHER                                   -0.0391858  0.0255640
## LightDARK-LIT                                     -0.0464936  0.0176972
## LightDARK-NOT-LIT                                 -0.1063687  0.0333406
## LightDAWN                                         -0.0259004  0.0669502
## LightDUSK                                         -0.0294692  0.0318988
## LightOTHER                                        -0.1330737  0.0637014
## Traffic_ControlSIGNAL-STOP                         0.0418374  0.0155124
## Work_AreaYES                                       0.0438690  0.0401251
## WETorDRY                                           0.0001240  0.0060803
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP -0.0306663  0.0284740
##                                                   t value Pr(>|t|)    
## (Intercept)                                         2.498  0.01250 *  
## year                                               -2.218  0.02656 *  
## Month2                                              1.477  0.13976    
## Month3                                              0.745  0.45637    
## Month4                                              0.019  0.98508    
## Month5                                              0.016  0.98690    
## Month6                                             -0.807  0.41978    
## Month7                                             -0.586  0.55809    
## Month8                                             -0.367  0.71353    
## Month9                                              0.088  0.92980    
## Month10                                             1.115  0.26483    
## Month11                                            -0.039  0.96867    
## Month12                                             2.122  0.03388 *  
## Time_of_DayOVERNIGHT                               -2.698  0.00698 ** 
## Time_of_DayLATE-EARLY                              -1.199  0.23072    
## Time_of_DayLATE-NIGHT                              -0.794  0.42728    
## Rd_FeatureINTERSECTION                              2.985  0.00284 ** 
## Rd_CharacterCURVE                                  -1.661  0.09668 .  
## Rd_ClassOTHER                                      -6.758 1.44e-11 ***
## Rd_ClassUS HWY                                      0.979  0.32739    
## Rd_ConfigurationONE-WAY                            -1.074  0.28300    
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN            0.691  0.48983    
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN         -0.263  0.79232    
## Rd_ConfigurationUNKNOWN                             0.519  0.60368    
## Rd_SurfaceOTHER                                    -1.533  0.12533    
## LightDARK-LIT                                      -2.627  0.00862 ** 
## LightDARK-NOT-LIT                                  -3.190  0.00142 ** 
## LightDAWN                                          -0.387  0.69886    
## LightDUSK                                          -0.924  0.35559    
## LightOTHER                                         -2.089  0.03672 *  
## Traffic_ControlSIGNAL-STOP                          2.697  0.00700 ** 
## Work_AreaYES                                        1.093  0.27427    
## WETorDRY                                            0.020  0.98373    
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP  -1.077  0.28150    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 17.97553)
## 
##     Null deviance: 315875  on 17353  degrees of freedom
## Residual deviance: 311335  on 17320  degrees of freedom
## AIC: 99420
## 
## Number of Fisher Scoring iterations: 6
# Report the in-sample AIC and the RMSE on `test` for the gaussian/log-link
# GLM. The predictions get their own name so that a numeric vector does not
# mask the `predict()` generic in the global environment.
print("AIC")
## [1] "AIC"
AIC(GLMnorm)
## [1] 99420.35
# type = "response" returns predictions on the scale of Crash_Score rather
# than on the scale of the linear predictor.
norm_pred <- predict(GLMnorm, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
sqrt(mean((test$Crash_Score - norm_pred)^2))
## [1] 4.285803
## [1] 4.285803
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Intercept-only Gamma GLM (log link): the starting point and the lower bound
# of the forward search below.
GLMgamma1 <- glm(Crash_Score ~ 1, family = Gamma(link = "log"), data = train)
# Forward selection from the null model towards the terms of GLMgamma.
# k = log(n) applies the BIC penalty, although stepAIC still labels the
# criterion "AIC" in its trace. The result is not assigned, so the selected
# model is printed.
stepAIC(
  GLMgamma1,
  scope = list(lower = GLMgamma1, upper = GLMgamma),
  direction = "forward",
  k = log(nrow(train))
)
## Start:  AIC=93452.1
## Crash_Score ~ 1
## 
##                    Df Deviance   AIC
## + Rd_Class          2   7260.8 93351
## + Traffic_Control   1   7272.3 93368
## + Rd_Feature        1   7274.3 93373
## + Rd_Configuration  4   7293.7 93448
## + Time_of_Day       3   7298.4 93450
## <none>                  7311.8 93452
## + Rd_Character      1   7309.6 93457
## + year              1   7310.3 93458
## + Rd_Surface        1   7310.4 93459
## + Work_Area         1   7311.0 93460
## + Light             5   7295.0 93461
## + WETorDRY          1   7311.6 93461
## + Month            11   7305.3 93544
## 
## Step:  AIC=93341.77
## Crash_Score ~ Rd_Class
## 
##                    Df Deviance   AIC
## + Rd_Feature        1   7242.4 93308
## + Traffic_Control   1   7243.8 93311
## + Time_of_Day       3   7246.2 93336
## <none>                  7260.8 93342
## + Rd_Character      1   7258.5 93346
## + Rd_Surface        1   7258.6 93346
## + year              1   7259.0 93347
## + Light             5   7243.6 93349
## + Work_Area         1   7260.3 93350
## + WETorDRY          1   7260.8 93352
## + Rd_Configuration  4   7259.1 93377
## + Month            11   7255.0 93435
## 
## Step:  AIC=93304.48
## Crash_Score ~ Rd_Class + Rd_Feature
## 
##                    Df Deviance   AIC
## + Time_of_Day       3   7228.4 93300
## + Traffic_Control   1   7238.2 93304
## <none>                  7242.4 93304
## + Rd_Character      1   7240.5 93310
## + year              1   7240.9 93311
## + Rd_Surface        1   7241.1 93311
## + Work_Area         1   7241.8 93313
## + WETorDRY          1   7242.4 93314
## + Light             5   7226.1 93314
## + Rd_Configuration  4   7241.7 93342
## + Month            11   7236.8 93399
## 
## Step:  AIC=93297.94
## Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day
## 
##                    Df Deviance   AIC
## <none>                  7228.4 93298
## + Traffic_Control   1   7224.5 93298
## + year              1   7226.7 93304
## + Rd_Character      1   7226.9 93304
## + Rd_Surface        1   7227.3 93305
## + Work_Area         1   7227.9 93307
## + WETorDRY          1   7228.4 93308
## + Light             5   7220.8 93328
## + Rd_Configuration  4   7227.6 93335
## + Month            11   7222.8 93392
## 
## Call:  glm(formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day, 
##     family = Gamma(link = "log"), data = train)
## 
## Coefficients:
##            (Intercept)           Rd_ClassOTHER          Rd_ClassUS HWY  
##               1.907945               -0.090945                0.003077  
## Rd_FeatureINTERSECTION    Time_of_DayOVERNIGHT   Time_of_DayLATE-EARLY  
##               0.075252               -0.136333               -0.032112  
##  Time_of_DayLATE-NIGHT  
##              -0.049669  
## 
## Degrees of Freedom: 17353 Total (i.e. Null);  17347 Residual
## Null Deviance:       7312 
## Residual Deviance: 7228  AIC: 93240
# Reduced Gamma GLM (log link) on `train` with four predictors: Rd_Class,
# Rd_Feature, Time_of_Day and Traffic_Control. Prints the coefficient table.
GLMgammaR <- glm(
  Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day + Traffic_Control,
  family = Gamma(link = "log"),
  data = train
)
summary(GLMgammaR)
## 
## Call:
## glm(formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day + 
##     Traffic_Control, family = Gamma(link = "log"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3199  -0.5542  -0.1433   0.2784   3.3175  
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.89608    0.01024 185.147  < 2e-16 ***
## Rd_ClassOTHER              -0.08340    0.01134  -7.352 2.03e-13 ***
## Rd_ClassUS HWY              0.00646    0.01696   0.381 0.703231    
## Rd_FeatureINTERSECTION      0.05082    0.01415   3.590 0.000331 ***
## Time_of_DayOVERNIGHT       -0.13431    0.02678  -5.016 5.32e-07 ***
## Time_of_DayLATE-EARLY      -0.03169    0.01945  -1.629 0.103333    
## Time_of_DayLATE-NIGHT      -0.05011    0.01643  -3.050 0.002293 ** 
## Traffic_ControlSIGNAL-STOP  0.04060    0.01337   3.037 0.002396 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.4186809)
## 
##     Null deviance: 7311.8  on 17353  degrees of freedom
## Residual deviance: 7224.5  on 17346  degrees of freedom
## AIC: 93236
## 
## Number of Fisher Scoring iterations: 5
# Report the in-sample AIC and the RMSE on `test` for the reduced Gamma GLM.
# The predictions get their own name so that a numeric vector does not mask
# the `predict()` generic in the global environment.
print("AIC")
## [1] "AIC"
AIC(GLMgammaR)
## [1] 93235.58
# type = "response" returns predictions on the scale of Crash_Score rather
# than on the scale of the linear predictor.
gamma_r_pred <- predict(GLMgammaR, newdata = test, type = "response")
print("RMSE")
## [1] "RMSE"
sqrt(mean((test$Crash_Score - gamma_r_pred)^2))
## [1] 4.284678
# Diagnostic plots for the fitted model.
plot(GLMgammaR)

Task 8

# Refit the four-predictor Gamma GLM (log link) on `dat` instead of `train`
# and print the coefficient table.
GLMgammaRdat <- glm(
  Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day + Traffic_Control,
  family = Gamma(link = "log"),
  data = dat
)
summary(GLMgammaRdat)
## 
## Call:
## glm(formula = Crash_Score ~ Rd_Class + Rd_Feature + Time_of_Day + 
##     Traffic_Control, family = Gamma(link = "log"), data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3222  -0.5544  -0.1428   0.2770   3.3252  
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.890687   0.008846 213.742  < 2e-16 ***
## Rd_ClassOTHER              -0.081323   0.009822  -8.280  < 2e-16 ***
## Rd_ClassUS HWY              0.018101   0.014698   1.232  0.21813    
## Rd_FeatureINTERSECTION      0.053271   0.012258   4.346 1.39e-05 ***
## Time_of_DayOVERNIGHT       -0.117653   0.023295  -5.051 4.44e-07 ***
## Time_of_DayLATE-EARLY      -0.050126   0.016803  -2.983  0.00286 ** 
## Time_of_DayLATE-NIGHT      -0.044273   0.014174  -3.124  0.00179 ** 
## Traffic_ControlSIGNAL-STOP  0.049265   0.011571   4.257 2.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.4192278)
## 
##     Null deviance: 9740.7  on 23136  degrees of freedom
## Residual deviance: 9615.7  on 23129  degrees of freedom
## AIC: 124305
## 
## Number of Fisher Scoring iterations: 5

Task 9

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
# Lasso (alpha = 1) with the penalty strength chosen by cross-validation.
# The seed is fixed because cv.glmnet assigns observations to folds at random.
set.seed(42)
# Numeric design matrix: factors are expanded to dummy columns and the
# Traffic_Control x Rd_Feature interaction is added.
X <- model.matrix(Crash_Score ~ . + Traffic_Control:Rd_Feature, data = train)
m <- cv.glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  alpha = 1
)
# Cross-validation curve over the lambda path.
plot(m)

# Design matrix for `test`, built with the same formula as X.
X.test <- model.matrix(
  Crash_Score ~ . + Traffic_Control:Rd_Feature,
  data = test
)
# Refit the lasso at the single lambda with the lowest cross-validated error.
m.best <- glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  alpha = 1,
  lambda = m$lambda.min
)
# Coefficients as a sparse matrix; entries printed as "." are exactly zero.
m.best$beta
## 34 x 1 sparse Matrix of class "dgCMatrix"
##                                                             s0
## (Intercept)                                        .          
## year                                              -0.035230776
## Month2                                             0.157206654
## Month3                                             0.046209463
## Month4                                             .          
## Month5                                             .          
## Month6                                            -0.072368785
## Month7                                            -0.041073747
## Month8                                            -0.006126929
## Month9                                             .          
## Month10                                            0.109130441
## Month11                                            .          
## Month12                                            0.263876244
## Time_of_DayOVERNIGHT                              -0.481383043
## Time_of_DayLATE-EARLY                             -0.090698959
## Time_of_DayLATE-NIGHT                             -0.070069854
## Rd_FeatureINTERSECTION                             0.310714390
## Rd_CharacterCURVE                                 -0.157376229
## Rd_ClassOTHER                                     -0.526174010
## Rd_ClassUS HWY                                     0.021778137
## Rd_ConfigurationONE-WAY                           -0.063368334
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN           0.060092421
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN         .          
## Rd_ConfigurationUNKNOWN                            0.044014946
## Rd_SurfaceOTHER                                   -0.143226618
## LightDARK-LIT                                     -0.270482724
## LightDARK-NOT-LIT                                 -0.554771485
## LightDAWN                                          .          
## LightDUSK                                         -0.058549755
## LightOTHER                                        -0.559246300
## Traffic_ControlSIGNAL-STOP                         0.224366735
## Work_AreaYES                                       0.174188162
## WETorDRY                                           .          
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP  .
# Score the lasso fit on `test`: predict from X.test, then compute the root
# mean squared error against the observed Crash_Score.
m.best.predict <- predict(m.best, newx = X.test)
rmse <- sqrt(sum((test$Crash_Score - m.best.predict)^2) / nrow(test))
rmse
## [1] 4.284406
# Ridge regression (alpha = 0) with the penalty strength chosen by
# cross-validation. The seed is reset to the value used for the lasso run.
set.seed(42)
# Numeric design matrix: factors are expanded to dummy columns and the
# Traffic_Control x Rd_Feature interaction is added.
X <- model.matrix(Crash_Score ~ . + Traffic_Control:Rd_Feature, data = train)
m <- cv.glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  alpha = 0
)
# Cross-validation curve over the lambda path.
plot(m)

# Design matrix for `test`, built with the same formula as X.
X.test <- model.matrix(
  Crash_Score ~ . + Traffic_Control:Rd_Feature,
  data = test
)
# Refit the ridge model at the single lambda with the lowest cross-validated
# error.
m.best <- glmnet(
  x = X,
  y = train$Crash_Score,
  family = "gaussian",
  alpha = 0,
  lambda = m$lambda.min
)
# Coefficients as a sparse matrix, one row per column of X.
m.best$beta
## 34 x 1 sparse Matrix of class "dgCMatrix"
##                                                            s0
## (Intercept)                                        .         
## year                                              -0.03760358
## Month2                                             0.15344261
## Month3                                             0.06397486
## Month4                                            -0.01799748
## Month5                                            -0.01975586
## Month6                                            -0.12436708
## Month7                                            -0.10135990
## Month8                                            -0.06693177
## Month9                                            -0.01624658
## Month10                                            0.11120885
## Month11                                           -0.05227089
## Month12                                            0.23526357
## Time_of_DayOVERNIGHT                              -0.47502496
## Time_of_DayLATE-EARLY                             -0.12033859
## Time_of_DayLATE-NIGHT                             -0.11155756
## Rd_FeatureINTERSECTION                             0.24403447
## Rd_CharacterCURVE                                 -0.16884312
## Rd_ClassOTHER                                     -0.41763915
## Rd_ClassUS HWY                                     0.10355680
## Rd_ConfigurationONE-WAY                           -0.11116457
## Rd_ConfigurationTWO-WAY-PROTECTED-MEDIAN           0.10725507
## Rd_ConfigurationTWO-WAY-UNPROTECTED-MEDIAN         0.03625030
## Rd_ConfigurationUNKNOWN                            0.26568994
## Rd_SurfaceOTHER                                   -0.20134860
## LightDARK-LIT                                     -0.24001818
## LightDARK-NOT-LIT                                 -0.50608543
## LightDAWN                                         -0.13232862
## LightDUSK                                         -0.12193123
## LightOTHER                                        -0.60898119
## Traffic_ControlSIGNAL-STOP                         0.21377149
## Work_AreaYES                                       0.26274006
## WETorDRY                                          -0.00117400
## Rd_FeatureINTERSECTION:Traffic_ControlSIGNAL-STOP  0.08556195
# Score the ridge fit on `test`: predict from X.test, then compute the root
# mean squared error against the observed Crash_Score.
m.best.predict <- predict(m.best, newx = X.test)
rmse <- sqrt(sum((test$Crash_Score - m.best.predict)^2) / nrow(test))
rmse
## [1] 4.284918

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.